library("FRESA.CAD")
library(readxl)
library(igraph)
library(umap)
library(tsne)
library(entropy)
# Snapshot the writable graphical parameters; later chunks call par(op) to
# restore them after changing cex settings for individual plots.
op <- par(no.readonly = TRUE)
# Global pander table options used by every pander::pander() call below.
pander::panderOptions('digits', 3)
pander::panderOptions('table.split.table', 400)
pander::panderOptions('keep.trailing.zeros',TRUE)
Data from the TADPOLE data set
# ---- Load the TADPOLE tables ----
# Main measurement table, its field dictionary, and the left/right feature map.
TADPOLE_D1_D2 <- read.csv("~/GitHub/BSWiMS/Data/TADPOLE/TADPOLE_D1_D2.csv")
TADPOLE_D1_D2_Dict <- read.csv("~/GitHub/BSWiMS/Data/TADPOLE/TADPOLE_D1_D2_Dict.csv")
TADPOLE_D1_D2_Dict_LR <- as.data.frame(
  read_excel(
    "~/GitHub/BSWiMS/Data/TADPOLE/TADPOLE_D1_D2_Dict_LR.xlsx",
    sheet = "LeftRightFeatures"
  )
)
# Index the dictionary by field name.
row.names(TADPOLE_D1_D2_Dict) <- TADPOLE_D1_D2_Dict$FLDNAME

# ---- Volumes: mm3 to mm (cube root) ----
# Fixed list of summary volumes plus every dictionary field whose description
# mentions "Volume".
isVolume <- c(
  "Ventricles", "Hippocampus", "WholeBrain", "Entorhinal", "Fusiform", "MidTemp", "ICV",
  with(TADPOLE_D1_D2_Dict, FLDNAME[str_detect(TEXT, "Volume")])
)
#TADPOLE_D1_D2[,isVolume] <- apply(TADPOLE_D1_D2[,isVolume],2,'^',(1/3))
TADPOLE_D1_D2[, isVolume] <- TADPOLE_D1_D2[, isVolume]^(1 / 3)

# ---- Areas: mm2 to mm (square root) ----
isArea <- with(TADPOLE_D1_D2_Dict, FLDNAME[str_detect(TEXT, "Area")])
TADPOLE_D1_D2[, isArea] <- sqrt(TADPOLE_D1_D2[, isArea])

# Logical mask over the columns: TRUE where the column name contains "UCSFFSX"
# (the cross-sectional measurements).
FreeSurfersetCross <- str_detect(colnames(TADPOLE_D1_D2), "UCSFFSX")

# Baseline rows only (VISCODE equal to "bl"; rows with a missing VISCODE are dropped).
baselineTadpole <- TADPOLE_D1_D2[which(TADPOLE_D1_D2$VISCODE == "bl"), ]
table(baselineTadpole$DX)
Dementia Dementia to MCI MCI MCI to Dementia
7 336 1 864 5
MCI to NL NL NL to MCI
2 521 1
# Count the baseline rows in each DX_bl category.
table(baselineTadpole$DX_bl)
AD CN EMCI LMCI SMC 342 417 310 562 106
# ---- Assemble the baseline analysis table ----
# Index the baseline rows by subject ID so later steps can select rows by PTID.
rownames(baselineTadpole) <- baselineTadpole$PTID

# Diagnosis, demographics, cognitive scores, summary volumes, and every
# cross-sectional column flagged by FreeSurfersetCross.
validBaselineTadpole <- cbind(DX=baselineTadpole$DX_bl,
                              AGE=baselineTadpole$AGE,
                              Gender=1*(baselineTadpole$PTGENDER=="Female"),
                              ADAS11=baselineTadpole$ADAS11,
                              ADAS13=baselineTadpole$ADAS13,
                              MMSE=baselineTadpole$MMSE,
                              RAVLT_immediate=baselineTadpole$RAVLT_immediate,
                              RAVLT_learning=baselineTadpole$RAVLT_learning,
                              RAVLT_forgetting=baselineTadpole$RAVLT_forgetting,
                              RAVLT_perc_forgetting=baselineTadpole$RAVLT_perc_forgetting,
                              FAQ=baselineTadpole$FAQ,
                              Ventricles=baselineTadpole$Ventricles,
                              Hippocampus=baselineTadpole$Hippocampus,
                              WholeBrain=baselineTadpole$WholeBrain,
                              Entorhinal=baselineTadpole$Entorhinal,
                              Fusiform=baselineTadpole$Fusiform,
                              MidTemp=baselineTadpole$MidTemp,
                              ICV=baselineTadpole$ICV,
                              baselineTadpole[,FreeSurfersetCross])

# Left/right feature pairs come row-by-row from the LFN/RFN columns of the
# dictionary sheet. Keep a pair only when BOTH members are present in the table:
# filtering each side on its own would shift the two lists against each other
# as soon as one member is missing, and every later left/right operation would
# silently combine unrelated features.
pairIsComplete <- TADPOLE_D1_D2_Dict_LR$LFN %in% colnames(validBaselineTadpole) &
  TADPOLE_D1_D2_Dict_LR$RFN %in% colnames(validBaselineTadpole)
LeftFields <- TADPOLE_D1_D2_Dict_LR$LFN[pairIsComplete]
names(LeftFields) <- LeftFields
RightFields <- TADPOLE_D1_D2_Dict_LR$RFN[pairIsComplete]
names(RightFields) <- RightFields

## Normalize to ICV
for (icvScaled in c("Ventricles", "Hippocampus", "WholeBrain",
                    "Entorhinal", "Fusiform", "MidTemp")) {
  validBaselineTadpole[[icvScaled]] <-
    validBaselineTadpole[[icvScaled]] / validBaselineTadpole$ICV
}
# drop = FALSE keeps a data frame even when a single pair survives.
leftData <- validBaselineTadpole[, LeftFields, drop = FALSE] / validBaselineTadpole$ICV
RightData <- validBaselineTadpole[, RightFields, drop = FALSE] / validBaselineTadpole$ICV

## get mean and relative difference
meanLeftRight <- (leftData + RightData) / 2
difLeftRight <- abs(leftData - RightData)
reldifLeftRight <- difLeftRight / meanLeftRight
# Prefix the (left-side) column names: M_ = mean, D_ = absolute difference,
# RD_ = relative difference.
colnames(meanLeftRight) <- paste0("M_", colnames(meanLeftRight))
colnames(difLeftRight) <- paste0("D_", colnames(difLeftRight))
colnames(reldifLeftRight) <- paste0("RD_", colnames(reldifLeftRight))

# Replace the individual left/right columns by their mean and difference.
validBaselineTadpole <- validBaselineTadpole[, !(colnames(validBaselineTadpole) %in%
                                                   c(LeftFields, RightFields))]
#validBaselineTadpole <- cbind(validBaselineTadpole,meanLeftRight,difLeftRight,reldifLeftRight)
validBaselineTadpole <- cbind(validBaselineTadpole, meanLeftRight, difLeftRight)

## Remove columns with too many NA more than %15 of NA
# NOTE: despite its name, nacount is a logical mask that is TRUE for the
# columns to KEEP (NA fraction below 0.15).
nacount <- colSums(is.na(validBaselineTadpole)) / nrow(validBaselineTadpole) < 0.15

# Keep the diagnosis aside; it is put back after character columns are removed.
diagnose <- validBaselineTadpole$DX
pander::pander(table(diagnose))
| AD | CN | EMCI | LMCI | SMC |
|---|---|---|---|---|
| 342 | 417 | 310 | 562 | 106 |
# Keep only the columns that passed the missing-value filter (nacount is TRUE
# for columns to keep).
validBaselineTadpole <- validBaselineTadpole[, nacount, drop = FALSE]

## Remove character columns
# vapply(is.character) always yields one logical per column, unlike comparing
# sapply(class) to a string, which breaks on multi-class columns.
ischar <- vapply(validBaselineTadpole, is.character, logical(1))
validBaselineTadpole <- validBaselineTadpole[, !ischar, drop = FALSE]

## Place back diagnose
validBaselineTadpole$DX <- diagnose

# Complete cases only, then force every non-character column to numeric.
validBaselineTadpole <- validBaselineTadpole[complete.cases(validBaselineTadpole), ]
ischar <- vapply(validBaselineTadpole, is.character, logical(1))
# lapply keeps one element per column even if a single row is left
# (sapply would collapse that case to a plain vector).
validBaselineTadpole[!ischar] <- lapply(validBaselineTadpole[!ischar], as.numeric)

# Shorten the column names and make them syntactic (no spaces).
colnames(validBaselineTadpole) <- str_remove_all(colnames(validBaselineTadpole),"_UCSFFSX_11_02_15_UCSFFSX51_08_01_16")
colnames(validBaselineTadpole) <- str_replace_all(colnames(validBaselineTadpole)," ","_")

# Drop identifier columns so they are not used as features.
validBaselineTadpole$LONISID <- NULL
validBaselineTadpole$IMAGEUID <- NULL
validBaselineTadpole$LONIUID <- NULL

# Store the diagnosis as plain character.
diagnose <- as.character(validBaselineTadpole$DX)
validBaselineTadpole$DX <- diagnose
pander::pander(table(validBaselineTadpole$DX))
| AD | CN | EMCI | LMCI | SMC |
|---|---|---|---|---|
| 245 | 359 | 272 | 444 | 93 |
# Collapse the five baseline labels into three groups:
# EMCI/LMCI become "MCI", CN/SMC become "NL"; other labels are left as they are.
validBaselineTadpole$DX[validBaselineTadpole$DX %in% c("EMCI", "LMCI")] <- "MCI"
validBaselineTadpole$DX[validBaselineTadpole$DX %in% c("CN", "SMC")] <- "NL"
pander::pander(table(validBaselineTadpole$DX))
| AD | MCI | NL |
|---|---|---|
| 245 | 716 | 452 |
# Subjects that survived the cleaning, and every visit code in the full table.
subjectsID <- row.names(validBaselineTadpole)
visitsID <- unique(TADPOLE_D1_D2[["VISCODE"]])

# Baseline diagnosis and exam date, indexed by subject ID and restricted
# (and ordered) to the retained subjects.
baseDx <- TADPOLE_D1_D2[TADPOLE_D1_D2[["VISCODE"]] == "bl", c("PTID", "DX", "EXAMDATE")]
row.names(baseDx) <- baseDx[["PTID"]]
baseDx <- baseDx[subjectsID, ]

# Both trackers start from the baseline record.
toDementia <- baseDx
lastDx <- baseDx
table(lastDx[["DX"]])
Dementia Dementia to MCI MCI MCI to Dementia MCI to NL
244 1 711 2 2
NL NL to MCI
452 1
# Subjects whose baseline diagnosis string already mentions "Dementia".
hasDementia <- lastDx$PTID[str_detect(lastDx$DX,"Dementia")]

# Visit the follow-ups in CHRONOLOGICAL order. unique(VISCODE) returns codes in
# order of first appearance in the file, which need not be chronological; with
# an out-of-order walk, "lastDx" would not be the latest diagnosis and
# "toDementia" would not freeze at the earliest dementia visit.
# Assumes codes are "bl" or "m<months>" (e.g. "m06"); any other code is visited
# last, in its original relative order.
visitCodes <- as.character(visitsID)
visitMonth <- rep(NA_real_, length(visitCodes))
isMonthCode <- str_detect(visitCodes, "^m[0-9]+$")
isMonthCode[is.na(isMonthCode)] <- FALSE
visitMonth[isMonthCode] <- as.numeric(str_remove(visitCodes[isMonthCode], "^m"))
visitMonth[visitCodes %in% "bl"] <- 0
orderedVisits <- visitCodes[order(visitMonth, na.last = TRUE)]

for (vid in orderedVisits)
{
  # Diagnosis records of the retained subjects at this visit, indexed by PTID.
  DxValue <- TADPOLE_D1_D2[TADPOLE_D1_D2$VISCODE==vid,c("PTID","DX","EXAMDATE")]
  rownames(DxValue) <- DxValue$PTID
  DxValue <- DxValue[DxValue$PTID %in% subjectsID,]

  # Visits without a diagnosis (empty or NA) carry the previous record
  # (diagnosis AND exam date) forward.
  noDX <- DxValue$PTID[is.na(DxValue$DX) | nchar(DxValue$DX) < 1]
  print(length(noDX))
  DxValue[noDX,] <- lastDx[noDX,]

  # Update the running "last known" record of every subject seen at this visit.
  inLast <- lastDx$PTID[lastDx$PTID %in% DxValue$PTID]
  print(length(inLast))
  lastDx[inLast,] <- DxValue[inLast,]

  # toDementia follows lastDx until a subject is first flagged with dementia;
  # from then on its record (and exam date) stays frozen at that visit.
  noDementia <- !(toDementia$PTID %in% hasDementia)
  toDementia[noDementia,] <- lastDx[noDementia,]
  hasDementia <- unique(c(hasDementia,lastDx$PTID[str_detect(lastDx$DX,"Dementia")]))
}
[1] 0 [1] 1413 [1] 2 [1] 1326 [1] 6 [1] 1218 [1] 23 [1] 1095 [1] 805 [1] 1058 [1] 29 [1] 710 [1] 20 [1] 212 [1] 14 [1] 167 [1] 32 [1] 553 [1] 25 [1] 298 [1] 18 [1] 130 [1] 667 [1] 667 [1] 112 [1] 112 [1] 176 [1] 176 [1] 177 [1] 177 [1] 625 [1] 625 [1] 251 [1] 251 [1] 159 [1] 159 [1] 7 [1] 7 [1] 17 [1] 99 [1] 9 [1] 63 [1] 1 [1] 1
# Diagnosis counts in lastDx after the visit loop has updated it.
table(lastDx$DX)
Dementia Dementia to MCI MCI MCI to Dementia MCI to NL
428 2 463 80 7
NL NL to Dementia NL to MCI
406 1 26
# Subjects whose baseline diagnosis is exactly "MCI".
baseMCI <- baseDx[["PTID"]][baseDx[["DX"]] == "MCI"]

# Subject IDs whose diagnosis string mentions "Dementia" (in the last record and
# in the frozen event record) or "NL" (in the last record).
lastDementia <- lastDx[["PTID"]][str_detect(lastDx[["DX"]], "Dementia")]
lastDementia2 <- toDementia[["PTID"]][str_detect(toDementia[["DX"]], "Dementia")]
lastNL <- lastDx[["PTID"]][str_detect(lastDx[["DX"]], "NL")]

# Baseline, event-time and last records of the baseline-MCI subjects.
MCIatBaseline <- baseDx[baseMCI, ]
MCIatEvent <- toDementia[baseMCI, ]
MCIatLast <- lastDx[baseMCI, ]

# Converters: baseline-MCI subjects listed in lastDementia; everyone else in
# the baseline-MCI group is a non-converter.
MCIconverters <- MCIatBaseline[baseMCI %in% lastDementia, ]
MCI_No_converters <- MCIatBaseline[!(baseMCI %in% MCIconverters[["PTID"]]), ]

# Time from the baseline exam to the exam stored in toDementia.
MCIconverters$TimeToEvent <-
  as.Date(toDementia[MCIconverters[["PTID"]], "EXAMDATE"]) - as.Date(MCIconverters[["EXAMDATE"]])
# Number of converters with a zero time to event.
sum(MCIconverters[["TimeToEvent"]] == 0)
[1] 0
# Diagnosis of each converter at the event record and at the last record.
MCIconverters[["AtEventDX"]] <- MCIatEvent[MCIconverters[["PTID"]], "DX"]
MCIconverters[["LastDX"]] <- MCIatLast[MCIconverters[["PTID"]], "DX"]

# Non-converters: follow-up time runs from baseline to the last record.
MCI_No_converters[["TimeToEvent"]] <-
  as.Date(lastDx[MCI_No_converters[["PTID"]], "EXAMDATE"]) -
  as.Date(MCI_No_converters[["EXAMDATE"]])
MCI_No_converters[["LastDX"]] <- MCIatLast[MCI_No_converters[["PTID"]], "DX"]
# Keep only non-converters with a strictly positive follow-up time.
MCI_No_converters <- MCI_No_converters[which(MCI_No_converters[["TimeToEvent"]] > 0), ]

# Prognosis cohort: converters first, then non-converters.
MCIPrognosisIDs <- c(MCIconverters[["PTID"]], MCI_No_converters[["PTID"]])
TADPOLECrossMRI <- validBaselineTadpole[MCIPrognosisIDs, ]
table(TADPOLECrossMRI[["DX"]])
MCI 680
# Replace the diagnosis column by the binary outcome:
# status is 1 for converters and 0 for everyone else in the cohort.
TADPOLECrossMRI[["DX"]] <- NULL
TADPOLECrossMRI[["status"]] <-
  as.numeric(row.names(TADPOLECrossMRI) %in% MCIconverters[["PTID"]])
table(TADPOLECrossMRI[["status"]])
0 1 436 244
# Experiment parameters shared by the rest of the analysis.
dataframe <- TADPOLECrossMRI
studyName <- "TADPOLE"
outcome <- "status"
TopVariables <- 10

# Data dimensions: number of rows and number of columns minus one.
pander::pander(c(rows = dim(dataframe)[1], col = dim(dataframe)[2] - 1))
| rows | col |
|---|---|
| 680 | 327 |
# Count of rows for each value of the outcome column.
pander::pander(table(dataframe[,outcome]))
| 0 | 1 |
|---|---|
| 436 | 244 |
# Feature names: every column except the outcome.
varlist <- colnames(dataframe)[colnames(dataframe) != outcome]

# Scale the data with FRESA.CAD (method "OrderLogit") and keep the scaled table.
dataframe <- FRESAScale(dataframe, method = "OrderLogit")$scaledData

# Heat map of the scaled data, rows clustered.
hm <- heatMaps(
  data = dataframe,
  Outcome = outcome,
  Scale = TRUE,
  hCluster = "row",
  cexRow = 0.25,
  cexCol = 0.15,
  ylab = "Sample",
  xlab = "Feature"
)
# Restore the graphical parameters saved in op.
par(op)
The heat map of the data
par(cex = 0.6, cex.main = 0.85, cex.axis = 0.7)

# Pearson correlation between the features; undefined correlations and the
# diagonal are set to zero.
cormat <- cor(dataframe[, varlist], method = "pearson")
cormat <- replace(cormat, is.na(cormat), 0)
diag(cormat) <- 0

gplots::heatmap.2(
  abs(cormat),
  main = "Original Correlation",
  xlab = "Feature", ylab = "Feature",
  trace = "none",
  # scale = "row",
  margins = c(5, 5),
  col = rev(heat.colors(5)),
  cexRow = 0.15,
  cexCol = 0.15,
  key.title = NA,
  key.xlab = "Pearson Correlation"
)

# Decorrelate the data with IDeA and list the resulting feature names
# (every column except the outcome).
DEdataframe <- IDeA(dataframe)
varlistc <- colnames(DEdataframe)
varlistc <- varlistc[varlistc != outcome]

# Total variance (sum of per-feature variances) of the original features.
pander::pander(sum(apply(dataframe[, varlist], 2, var)))
288
# Sum of the per-feature variances of the IDeA-transformed features.
pander::pander(sum(apply(DEdataframe[,varlistc],2,var)))
266
# Entropy of all original feature values pooled together and discretized into 256 bins.
pander::pander(entropy(discretize(unlist(dataframe[,varlist]), 256)))
4.81
# Entropy of all IDeA-transformed feature values pooled together and discretized into 256 bins.
pander::pander(entropy(discretize(unlist(DEdataframe[,varlistc]), 256)))
4.76
par(cex = 0.6, cex.main = 0.85, cex.axis = 0.7)

# Transformation matrix stored by IDeA in the "UPSTM" attribute; the heat map
# shows which of its entries are non-zero (1) and which are zero (0).
UPSTM <- attr(DEdataframe, "UPSTM")
gplots::heatmap.2(
  (UPSTM != 0) * 1.0,
  main = "Decorrelation matrix",
  xlab = "Output Feature", ylab = "Input Feature",
  trace = "none",
  margins = c(5, 5),
  col = rev(heat.colors(5)),
  cexRow = 0.15,
  cexCol = 0.15,
  key.title = NA,
  key.xlab = "|Beta|>0"
)
par(op)

# Pearson correlation between the transformed features; undefined correlations
# and the diagonal are set to zero before reporting the largest absolute value.
cormat <- cor(DEdataframe[, varlistc], method = "pearson")
cormat <- replace(cormat, is.na(cormat), 0)
diag(cormat) <- 0
print(max(abs(cormat)))
[1] 0.7992567
# Heat map of the absolute correlation between the transformed features.
gplots::heatmap.2(abs(cormat),
                  trace = "none",
                  margins = c(5,5),
                  col=rev(heat.colors(5)),
                  main = "Correlation after IDeA",
                  cexRow = 0.15,
                  cexCol = 0.15,
                  key.title=NA,
                  key.xlab="Pearson Correlation",
                  xlab="Feature", ylab="Feature")
par(op)

# One colour per outcome class, looked up BY CLASS NAME. Indexing with
# `outcome + 1` only works when the classes are coded 0, 1, ..., k-1 and ignores
# the names given to the palette; sorting the classes keeps the same colours
# for 0/1 outcomes.
classes <- sort(unique(dataframe[,outcome]))
raincolors <- rainbow(length(classes))
names(raincolors) <- classes

# 2-D UMAP embedding of the standardized original features; every sample is
# drawn as its outcome label.
datasetframe.umap <- umap(scale(dataframe[,varlist]),n_components=2)
plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: Original",type="n")
text(datasetframe.umap$layout,
     labels=dataframe[,outcome],
     col=raincolors[as.character(dataframe[,outcome])])

# Same embedding for the IDeA-transformed features.
datasetframe.umap <- umap(scale(DEdataframe[,varlistc]),n_components=2)
plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: After IDeA",type="n")
text(datasetframe.umap$layout,
     labels=DEdataframe[,outcome],
     col=raincolors[as.character(DEdataframe[,outcome])])

# Univariate ranking of the original features (rankingTest = "AUC").
univarRAW <- uniRankVar(varlist,
                        paste(outcome,"~1"),
                        outcome,
                        dataframe,
                        rankingTest="AUC")
100 : M_ST24SA 200 : D_ST49TA 300 : D_ST47CV
# Univariate ranking of the IDeA-transformed features (rankingTest = "AUC").
# The stray trailing comma of the original call, which passed an extra empty
# argument, has been removed.
univarDe <- uniRankVar(varlistc,
                       paste(outcome,"~1"),
                       outcome,
                       DEdataframe,
                       rankingTest="AUC")
100 : M_ST24SA 200 : D_ST49TA 300 : La_D_ST47CV
# Columns of the ranking table that are reported.
univariate_columns <- c("caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC")

## top variables
# Logical mask over varlist positions, TRUE for the first TopVariables entries.
# seq_along() is safe for an empty varlist (1:length() would yield c(1, 0)).
topvar <- seq_along(varlist) <= TopVariables
# Print the first TopVariables rows directly; head() cannot recycle the way a
# logical mask does when its length differs from the number of rows.
pander::pander(head(univarRAW$orderframe[,univariate_columns], TopVariables))
| caseMean | caseStd | controlMean | controlStd | controlKSP | ROCAUC | |
|---|---|---|---|---|---|---|
| ADAS13 | 0.601 | 0.778 | -0.2712 | 0.755 | 0.01704 | 0.788 |
| ADAS11 | 0.614 | 0.924 | -0.2621 | 0.825 | 0.00086 | 0.761 |
| FAQ | 0.832 | 1.082 | -0.0134 | 0.724 | 0.00000 | 0.756 |
| M_ST40CV | -0.579 | 0.905 | 0.2366 | 0.821 | 0.30865 | 0.750 |
| M_ST29SV | -0.501 | 0.802 | 0.2761 | 0.859 | 0.31867 | 0.745 |
| M_ST12SV | -0.546 | 0.844 | 0.2413 | 0.883 | 0.35129 | 0.744 |
| Hippocampus | -0.489 | 0.805 | 0.2658 | 0.869 | 0.22638 | 0.737 |
| RAVLT_immediate | -0.374 | 0.704 | 0.3807 | 0.979 | 0.02525 | 0.728 |
| M_ST24CV | -0.568 | 0.959 | 0.2072 | 0.876 | 0.08763 | 0.727 |
| M_ST31CV | -0.496 | 0.889 | 0.2225 | 0.864 | 0.74725 | 0.717 |
# First TopVariables rows of the ranking of the transformed features. head() is
# used instead of the `topvar` mask, which was built from the ORIGINAL feature
# list and would be recycled if this table had a different number of rows.
finalTable <- head(univarDe$orderframe[,univariate_columns], TopVariables)
pander::pander(finalTable)
| caseMean | caseStd | controlMean | controlStd | controlKSP | ROCAUC | |
|---|---|---|---|---|---|---|
| ADAS11 | 0.614 | 0.924 | -0.2621 | 0.825 | 0.00086 | 0.761 |
| FAQ | 0.832 | 1.082 | -0.0134 | 0.724 | 0.00000 | 0.756 |
| M_ST40CV | -0.579 | 0.905 | 0.2366 | 0.821 | 0.30865 | 0.750 |
| M_ST12SV | -0.546 | 0.844 | 0.2413 | 0.883 | 0.35129 | 0.744 |
| Hippocampus | -0.489 | 0.805 | 0.2658 | 0.869 | 0.22638 | 0.737 |
| RAVLT_immediate | -0.374 | 0.704 | 0.3807 | 0.979 | 0.02525 | 0.728 |
| M_ST24CV | -0.568 | 0.959 | 0.2072 | 0.876 | 0.08763 | 0.727 |
| M_ST31CV | -0.496 | 0.889 | 0.2225 | 0.864 | 0.74725 | 0.717 |
| M_ST24TA | -0.557 | 0.902 | 0.1400 | 0.837 | 0.06250 | 0.717 |
| WholeBrain | -0.471 | 0.802 | 0.1953 | 0.855 | 0.21185 | 0.715 |
# Coefficients of the latent (decorrelated) features and the "fscore" attribute
# stored on the transformed data.
dc <- getLatentCoefficients(DEdataframe)
fscores <- attr(DEdataframe,"fscore")

# Coefficient vectors of the top features (NULL for features with no entry in
# dc) and an empty formula string for each of them.
theFormulas <- dc[rownames(finalTable)]
deFromula <- character(length(theFormulas))
names(deFromula) <- rownames(finalTable)

# Mean number of coefficients per latent feature, number of latent features,
# and that number relative to the number of columns minus one.
# lengths() always returns an integer vector, even when dc is empty.
pander::pander(c(mean=mean(lengths(dc)),total=length(dc),fraction=length(dc)/(ncol(dataframe)-1)))
| mean | total | fraction |
|---|---|---|
| 2.21 | 33 | 0.101 |
# Build, for each top feature, a text formula listing the OTHER features that
# enter its latent definition: " + c*name" for positive coefficients and
# " c*name" (sign printed by the number) otherwise. Features without
# coefficients keep an empty string.
for (dx in names(deFromula))
{
  coef <- theFormulas[[dx]]
  for (cf in names(coef))
  {
    if (cf != dx)
    {
      termFormat <- if (coef[cf] > 0) "+ %5.3f*%s" else "%5.3f*%s"
      deFromula[dx] <- paste(deFromula[dx], sprintf(termFormat, coef[cf], cf))
    }
  }
}

# Name of the original feature behind each row: strip the leading "La_" prefix
# only (anchored, so the same characters inside a name are never removed).
orgnamez <- str_remove(rownames(finalTable), "^La_")

# Add the AUC of the original feature, the formula text and the fscore.
finalTable$RAWAUC <- univarRAW$orderframe[orgnamez,"ROCAUC"]
finalTable$DecorFormula <- deFromula[rownames(finalTable)]
finalTable$fscores <- fscores[rownames(finalTable)]

Final_Columns <- c("DecorFormula","caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC","RAWAUC","fscores")
pander::pander(finalTable[,Final_Columns])
| DecorFormula | caseMean | caseStd | controlMean | controlStd | controlKSP | ROCAUC | RAWAUC | fscores | |
|---|---|---|---|---|---|---|---|---|---|
| ADAS11 | 0.614 | 0.924 | -0.2621 | 0.825 | 0.00086 | 0.761 | 0.761 | 1 | |
| FAQ | 0.832 | 1.082 | -0.0134 | 0.724 | 0.00000 | 0.756 | 0.756 | NA | |
| M_ST40CV | -0.579 | 0.905 | 0.2366 | 0.821 | 0.30865 | 0.750 | 0.750 | NA | |
| M_ST12SV | -0.546 | 0.844 | 0.2413 | 0.883 | 0.35129 | 0.744 | 0.744 | NA | |
| Hippocampus | -0.489 | 0.805 | 0.2658 | 0.869 | 0.22638 | 0.737 | 0.737 | 1 | |
| RAVLT_immediate | -0.374 | 0.704 | 0.3807 | 0.979 | 0.02525 | 0.728 | 0.728 | NA | |
| M_ST24CV | -0.568 | 0.959 | 0.2072 | 0.876 | 0.08763 | 0.727 | 0.727 | NA | |
| M_ST31CV | -0.496 | 0.889 | 0.2225 | 0.864 | 0.74725 | 0.717 | 0.717 | NA | |
| M_ST24TA | -0.557 | 0.902 | 0.1400 | 0.837 | 0.06250 | 0.717 | 0.717 | NA | |
| WholeBrain | -0.471 | 0.802 | 0.1953 | 0.855 | 0.21185 | 0.715 | 0.715 | NA |